#pragma once

#if defined(__MACACC__)
// #include "__clang_maca_mckl_lib_header.h"
#include "mc_runtime_types.h"
#include "mckl/mc_device_runtime.h"

extern "C"
{
__forceinline__ __device__ mcError_t wcudaDeviceGetAttribute(
    int *value, enum mcDeviceAttribute_t attr, int device)
{
        return mcDeviceGetAttribute(value, attr, device);
}
__forceinline__ __device__ mcError_t wcudaDeviceGetLimit(size_t *pValue, enum mcLimit_t limit)
{
    return mcDeviceGetLimit(pValue, limit);
}
__forceinline__ __device__ mcError_t wcudaDeviceGetCacheConfig(enum mcFuncCache_t *pCacheConfig)
{
    return mcDeviceGetCacheConfig(pCacheConfig);
}
__forceinline__ __device__ mcError_t wcudaDeviceGetSharedMemConfig(enum mcSharedMemConfig *pConfig)
{
    return mcDeviceGetSharedMemConfig(pConfig);
}
__forceinline__ __device__ mcError_t wcudaDeviceSynchronize(void) { return mcDeviceSynchronize(); }
__forceinline__ __device__ mcError_t wcudaGetLastError(void) { return mcGetLastError(); }
__forceinline__ __device__ mcError_t wcudaPeekAtLastError(void) { return mcPeekAtLastError(); }
__forceinline__ __device__ const char *wcudaGetErrorString(mcError_t error)
{
    return mcGetErrorString(error);
}
__forceinline__ __device__ const char *wcudaGetErrorName(mcError_t error)
{
    return mcGetErrorName(error);
}
__forceinline__ __device__ mcError_t wcudaGetDeviceCount(int *count)
{
    return mcGetDeviceCount(count);
}
__forceinline__ __device__ mcError_t wcudaGetDevice(int *device) { return mcGetDevice(device); }
__forceinline__ __device__ mcError_t wcudaStreamCreateWithFlags(
    mcStream_t *pStream __attribute__((unused)), unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaStreamDestroy(mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaStreamWaitEvent(
    mcStream_t stream __attribute__((unused)), mcEvent_t event __attribute__((unused)),
    unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaStreamWaitEvent_ptsz(
    mcStream_t stream __attribute__((unused)), mcEvent_t event __attribute__((unused)),
    unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventCreateWithFlags(
    mcEvent_t *event __attribute__((unused)), unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventRecord(mcEvent_t event __attribute__((unused)),
                                                      mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventRecord_ptsz(
    mcEvent_t event __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventRecordWithFlags(
    mcEvent_t event __attribute__((unused)), mcStream_t stream __attribute__((unused)),
    unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventRecordWithFlags_ptsz(
    mcEvent_t event __attribute__((unused)), mcStream_t stream __attribute__((unused)),
    unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaEventDestroy(mcEvent_t event __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaFuncGetAttributes(
    struct mcFuncAttributes *attr __attribute__((unused)), const void *func __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaFree(void *devPtr) { return mcFree(devPtr); }
__forceinline__ __device__ mcError_t wcudaMalloc(void **devPtr, size_t size)
{
    return mcMalloc(devPtr, size);
}
__forceinline__ __device__ mcError_t wcudaMemcpyAsync(void *dst, const void *src, size_t count,
                                                      enum _mcMemcpyKind kind, mcStream_t stream)
{
    return mcMemcpyAsync(dst, src, count, kind, stream);
}
__forceinline__ __device__ mcError_t wcudaMemcpyAsync_ptsz(
    void *dst __attribute__((unused)), const void *src __attribute__((unused)),
    size_t count __attribute__((unused)), enum _mcMemcpyKind kind __attribute__((unused)),
    mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src,
                                                        size_t spitch, size_t width, size_t height,
                                                        enum _mcMemcpyKind kind, mcStream_t stream)
{
    return mcMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
}
__forceinline__ __device__ mcError_t wcudaMemcpy2DAsync_ptsz(
    void *dst __attribute__((unused)), size_t dpitch __attribute__((unused)),
    const void *src __attribute__((unused)), size_t spitch __attribute__((unused)),
    size_t width __attribute__((unused)), size_t height __attribute__((unused)),
    enum _mcMemcpyKind kind __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaMemcpy3DAsync(const struct mcMemcpy3DParms *p,
                                                        mcStream_t stream)
{
    return mcMemcpy3DAsync(p, stream);
}
__forceinline__ __device__ mcError_t wcudaMemcpy3DAsync_ptsz(
    const struct mcMemcpy3DParms *p __attribute__((unused)),
    mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaMemsetAsync(void *devPtr, int value, size_t count,
                                                      mcStream_t stream)
{
    return mcMemsetAsync(devPtr, value, count, stream);
}
__forceinline__ __device__ mcError_t wcudaMemsetAsync_ptsz(
    void *devPtr __attribute__((unused)), int value __attribute__((unused)),
    size_t count __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaMemset2DAsync(void *devPtr, size_t pitch, int value,
                                                        size_t width, size_t height,
                                                        mcStream_t stream)
{
    return mcMemset2DAsync(devPtr, pitch, value, width, height, stream);
}
__forceinline__ __device__ mcError_t wcudaMemset2DAsync_ptsz(
    void *devPtr __attribute__((unused)), size_t pitch __attribute__((unused)),
    int value __attribute__((unused)), size_t width __attribute__((unused)),
    size_t height __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaMemset3DAsync(struct mcPitchedPtr pitchedDevPtr,
                                                        int value, struct mcExtent extent,
                                                        mcStream_t stream)
{
    return mcMemset3DAsync(pitchedDevPtr, value, extent, stream);
}
__forceinline__ __device__ mcError_t wcudaMemset3DAsync_ptsz(
    struct mcPitchedPtr pitchedDevPtr __attribute__((unused)), int value __attribute__((unused)),
    struct mcExtent extent __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaRuntimeGetVersion(int *runtimeVersion)
{
    return mcRuntimeGetVersion(runtimeVersion);
}
__forceinline__ __device__ void *wcudaGetParameterBuffer(size_t alignment, size_t size)
{
    return mcGetParameterBuffer(alignment, size);
}
__forceinline__ __device__ void *wcudaGetParameterBufferV2(void *func, dim3 gridDimension,
                                                           dim3 blockDimension,
                                                           unsigned int sharedMemSize)
{
    return mcGetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize);
}
__forceinline__ __device__ mcError_t wcudaLaunchDevice_ptsz(
    void *func __attribute__((unused)), void *parameterBuffer __attribute__((unused)),
    dim3 gridDimension __attribute__((unused)), dim3 blockDimension __attribute__((unused)),
    unsigned int sharedMemSize __attribute__((unused)), mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaLaunchDeviceV2_ptsz(
    void *parameterBuffer __attribute__((unused)),
    mcStream_t stream __attribute__((unused)))
{
    return mcErrorNotSupported;
}
__forceinline__ __device__ mcError_t wcudaLaunchDevice(void *func, void *parameterBuffer,
                                                       dim3 gridDimension, dim3 blockDimension,
                                                       unsigned int sharedMemSize,
                                                       mcStream_t stream)
{
    return mcLaunchDevice(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize,
                          stream);
}
__forceinline__ __device__ mcError_t wcudaLaunchDeviceV2(void *parameterBuffer, mcStream_t stream)
{
    return mcLaunchDeviceV2(parameterBuffer, stream);
}
__forceinline__ __device__ unsigned long long wcudaCGGetIntrinsicHandle(enum mcCGScope scope)
{
    return mcCGGetIntrinsicHandle(scope);
}
__forceinline__ __device__ mcError_t wcudaCGSynchronize(unsigned long long handle,
                                                        unsigned int flags)
{
    return mcCGSynchronize(handle, flags);
}
__forceinline__ __device__ mcError_t wcudaCGSynchronizeGrid(unsigned long long handle,
                                                            unsigned int flags)
{
    return mcCGSynchronizeGrid(handle, flags);
}
__forceinline__ __device__ mcError_t wcudaCGGetSize(unsigned int *numThreads,
                                                    unsigned int *numGrids,
                                                    unsigned long long handle)
{
    return mcCGGetSize(numThreads, numGrids, handle);
}
__forceinline__ __device__ mcError_t wcudaCGGetRank(unsigned int *threadRank,
                                                    unsigned int *gridRank,
                                                    unsigned long long handle)
{
    return mcCGGetRank(threadRank, gridRank, handle);
}
/* TODO: Those functions are not implemented by compiler or kernel lib yet. */
__forceinline__ __device__ unsigned int __pm0(void) { return 0; }
__forceinline__ __device__ unsigned int __pm1(void) { return 0; }
__forceinline__ __device__ unsigned int __pm2(void) { return 0; }
__forceinline__ __device__ unsigned int __pm3(void) { return 0; }
__forceinline__ __device__ void __prof_trigger(int) {}
}

template <typename T>
static __device__ mcError_t wcudaMalloc(T **devPtr __attribute__((unused)),
                                        size_t size __attribute__((unused)))
{
    return mcErrorNotSupported;
}
template <typename T>
static __device__ mcError_t wcudaFuncGetAttributes(
    struct mcFuncAttributes *attr __attribute__((unused)), T *entry __attribute__((unused)))
{
    return mcErrorNotSupported;
}
template <typename T>
static __device__ mcError_t wcudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks,
                                                                           T func, int blockSize,
                                                                           size_t dynamicSmemSize)
{
    return mcOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSmemSize);
}
template <typename T>
static __device__ mcError_t wcudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
    int *numBlocks __attribute__((unused)), T func __attribute__((unused)),
    int blockSize __attribute__((unused)), size_t dynamicSmemSize __attribute__((unused)),
    unsigned int flags __attribute__((unused)))
{
    return mcErrorNotSupported;
}

#endif
